Setup and preprocessing

library(here)
library(rethinking)
library(tidyverse); print(packageVersion("tidyverse"))
library(phyloseq)
library(bayesplot)
library(tidybayes)
library(ggbeeswarm)
library(magrittr)
library(tsibble)
library(rstan)
library(ggridges)
library(patchwork)
# library(kableExtra)

# Cache compiled Stan models on disk and run chains in parallel
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

# `params` is supplied by the R Markdown YAML header
fig_fp <- params$fig_fp
if (!dir.exists(here(fig_fp))) {
  # recursive = TRUE: create intermediate directories as needed; the original
  # bare dir.create() failed when the parent directory did not exist
  dir.create(here(fig_fp), recursive = TRUE)
}

# Project-local helpers (load_mira_data, phyloseq_to_agglomerated,
# effective_window, pp_subject_1.5.0, ...)
source(here("shared_functions.R"))

# Shared ggplot theme for all figures in this document
mira_theme <- theme_bw() + theme(
  strip.background = element_rect(fill = NA)
)
theme_set(mira_theme)

We first load all the data into one object. This contains metadata related to each specimen, and the sequence and taxonomy table for all amplicon sequence variants (ASVs).

# Load the full MIRA dataset into one object. `load_mira_data()` is defined
# in shared_functions.R; based on usage below, the returned list carries at
# least: $ps (phyloseq object), $seqs (ASV sequences), $meds (medication
# records), and $samples (specimen metadata table).
mira.all <- load_mira_data(
  seqtab_fp = here("../shared/data/seqtab.rds"),
  taxa_fp = here("../shared/data/taxa.rds"),
  meds_fp = here("../shared/data/MIRA_Medications_Table.csv"),
  specimen_fp = here("../shared/data/MIRA_Specimen_Table.csv")
)

# Convenience bindings used throughout the rest of the document
seqs <- mira.all$seqs
meds <- mira.all$meds

Here we filter and preprocess the data. We first remove non-informative samples, merge duplicates, and restrict the data to the taxon of interest (Staphylococcus aureus). Next, we calculate our “antibiotic effectiveness window” for each subject and the antibiotic of interest (intravenous vancomycin).

mira <- mira.all$ps

# Remove the samples flagged by `mask` from a phyloseq object, reporting the
# count removed; `what` names the sample category in the message.
.drop_flagged <- function(ps, mask, what) {
  print(sprintf("Removing %d %s samples...", sum(mask), what))
  prune_samples(!mask, ps)
}

# Remove non-MIRA subjects from dataset (incl. blanks and controls)
mira <- .drop_flagged(mira, is.na(sample_data(mira)$subject_id), "non-MIRA")
# Remove culture samples (they break the subject/type/date unique constraint)
mira <- .drop_flagged(mira, grepl("Culture", sample_data(mira)$specimen_type), "culture")
# Remove empty samples
mira <- .drop_flagged(mira, sample_sums(mira) == 0, "empty")

# Identify "duplicated" specimens (same subject, specimen type, and study day)
sample_data(mira) <- sample_data(mira) %>% 
  group_by(subject_id, specimen_type, study_day) %>%
  # "specimen_id3" has the same value for duplicated specimens so phyloseq can 
  # use it as a grouping level; duplicates all share the first specimen's ID
  # with a "_D" suffix, singletons keep their own ID
  mutate(specimen_id3 = case_when(
    n() > 1 ~ paste0(first(as.character(specimen_id2)), "_D"),
    TRUE ~ as.character(specimen_id2)
  )) %>%
  ungroup() %>% as.data.frame() %>%
  # phyloseq matches sample metadata by rownames; the dplyr pipeline drops
  # them, so restore specimen_id2 as rownames before reassigning
  set_rownames(.$specimen_id2)

# Sum abundances and merge sample table for duplicates
mira <- phyloseq::merge_samples(mira, "specimen_id3", fun=sum)
# Re-add the relevant metadata since merge_samples mangles factors and dates:
# keep only the ID columns from the merged object and re-join the original
# specimen table to recover clean metadata
sample_data(mira) <- sample_data(mira) %>% 
  mutate(specimen_id2 = rownames(.)) %>%
  select(specimen_id2, specimen_id3) %>%
  # join keys inferred by left_join (presumably specimen_id2); mira.all$samples
  # is the untouched specimen table loaded above
  left_join(mira.all$samples) %>%
  ungroup() %>% as.data.frame() %>%
  set_rownames(.$specimen_id2)

# Restrict to only samples for which we have abx data
.abx_specimens <- as.character(inner_join(sample_data(mira), meds)$specimen_id2)
mira.abx <- prune_samples(
  sample_data(mira)$specimen_id2 %in% .abx_specimens, mira)

# Converted to melted form: one row per (specimen, OTU) with taxonomy columns.
# phyloseq_to_agglomerated() is defined in shared_functions.R.
agg <- phyloseq_to_agglomerated(mira.abx, "specimen_id2", "otu_id", "read_count")
d <- agg %>%
  # Calculate total reads
  group_by(specimen_id2) %>%
  mutate(total_reads = sum(read_count)) %>%
  ungroup() %>%
  # Collapse reads by genus/species
  filter(!is.na(Genus), !is.na(Species)) %>%
  # total_reads is constant within a specimen, so including it in the
  # grouping keeps the column through summarise()
  group_by(specimen_id2, total_reads, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>%
  summarise(read_count = sum(read_count)) %>%
  ungroup() %>%
  # Restrict to the taxon of interest (from the R Markdown params)
  filter(Genus == params$genus) %>%
  filter(Species == params$species) %>%
  # Attach specimen metadata; join keys inferred (presumably specimen_id2)
  left_join(sample_data(mira.abx)) %>%
  select(-c(Kingdom:Family))

# Calculate antibiotic effectiveness windows: for each subject, flag the study
# days on which the antibiotic of interest (params$abx) is considered active.
subjects <- sample_data(mira.abx) %>%
  group_by(subject_id) %>% 
  # A subject's exit date is their last specimen collection date
  mutate(exit_date = max(collection_date)) %>%
  distinct(subject_id, enroll_date, exit_date) %>%
  # Expand to one row per medication record (keeps all meds rows)
  right_join(meds) %>%
  group_by(subject_id) %>%
  mutate(study_day = as.integer(collection_date - enroll_date)) %>%
  # NOTE(review): assumes exactly one collection_date per subject equals
  # exit_date; multiple matches would make this mutate() fail -- confirm
  mutate(exit_day = study_day[collection_date == exit_date]) %>%
  # Limit to shortly before the enrollment date (study_day > -3 keeps only
  # the two days prior, not a full week) and nothing after exit
  filter(study_day > -3, collection_date <= exit_date) %>%
  mutate(abx_yn = grepl(params$abx, abx_b)) %>%
  # the .size parameter here is how long it takes to reach peak
  # 1 = that day
  mutate(reached_peak = slide_lgl(abx_yn, all, .size=2)) %>%
  # the lag parameter here defines how long it lasts after end of admin.
  # effective_window() is defined in shared_functions.R
  mutate(on_abx = effective_window(reached_peak, lag=1)) %>%
  ungroup()
  
# Manually split MIRA_024 into two sub-subjects: the first episode runs
# through study day 33, the second starts at day 73; the gap is dropped.
subjects <- subjects %>%
  mutate(subject_id2 = case_when(
    subject_id != "MIRA_024" ~ subject_id,
    study_day <= 33 ~ "MIRA_024a",
    study_day >= 73 ~ "MIRA_024b",
    TRUE ~ NA_character_
  )) %>%
  # Remove rows falling between the two MIRA_024 episodes
  filter(!is.na(subject_id2)) %>%
  # Re-zero the study clock for the second episode
  mutate(
    study_day = case_when(
      subject_id2 == "MIRA_024b" ~ study_day - 73,
      TRUE ~ as.double(study_day)
    ),
    exit_day = case_when(
      subject_id2 == "MIRA_024b" ~ exit_day - 73,
      TRUE ~ exit_day
    )
  )

# Attach the taxon abundance table to the per-day subject records
d2 <- left_join(subjects, d)

The following figure shows the timecourses for each subject, including the antibiotic administration periods and the days each specimen type was collected.

Model v1.5

We will use a binomial model of reads attributable to our bacteria of interest in a given specimen \(i\) from a subject \(j\) (\(y_{ij}\)), given the number of total reads in the specimen (\(T_i\)) and some probability \(p_{ij}\) of seeing that bacteria: \[ y_{ij} \sim \text{Binom}(T_i,\ p_{ij}) \] We model the probability \(p_{ij}\) as a linear combination of our predictors: the previous day’s proportional abundance of that bacteria in the subject, \(x_{ij}\), with coefficient \(\beta_{\text{prev}}\), and an indicator \(a_{ij}\) of whether the subject is under the effects of antibiotics, with subject-specific coefficient \(\beta_{\text{abx}_j}\). In addition, we include a global intercept \(\alpha_0\), a subject-specific intercept \(\alpha_j\), and a specimen-specific intercept \(\alpha_i\). These sub-intercepts allow the model to account for varying starting values for each subject (\(\alpha_j\)), and varying sequencing effort in each specimen (\(\alpha_i\)). \[ \text{logit}(p_{ij}) = \alpha_0 + \alpha_j + \alpha_i + \beta_{\text{prev}}\,x_{ij} + \beta_{\text{abx}_j}\,a_{ij} \] The \(\text{logit}(p_{ij})\) term in the model transforms our linear combination of predictors into a probability between 0-1 so we can use it in the binomial model above.

While we don’t know the values of these parameters yet (hence the model), we can describe a probability distribution for each of them that we expect will contain the true value based on our prior expectations.

First, we guess that our global parameters, \(\alpha_0\) and \(\beta_{\text{prev}}\), come from a normal distribution centered around 0 (i.e, they could be positive or negative). \[ \alpha_0 \sim \text{Normal}(0,\ 1) \\ \beta_{\text{prev}} \sim \text{Normal}(0,\ 1) \]

The same goes for our subject-level parameters, \(\alpha_j\) and \(\beta_{\text{abx}_j}\). Each subject will get its own normal distribution centered at 0, but the variance parameter \(\sigma\) of those distributions will be shared between subjects. This leads to partial pooling of the subject data, so that subjects with less data will be pulled towards the population mean. We will model \(\sigma\) with a half-Cauchy distribution, which resembles a half-normal distribution but has much heavier tails (allowing for more extreme values) and is restricted to values greater than 0. \[ \alpha_j \sim \text{Normal}(0,\ \sigma_j) \\ \sigma_j \sim \text{Half-Cauchy}(0,\ 1) \\ \beta_{\text{abx}_j} \sim \text{Normal}(0,\ \sigma_{\text{abx}}) \\ \sigma_{\text{abx}} \sim \text{Half-Cauchy}(0,\ 1) \]

Finally, the specimen-level intercepts are also modeled the same way, with partial pooling. This means that the specimens with only a few reads (and hence higher uncertainty) are adjusted towards the median specimen intercept. \[ \alpha_i \sim \text{Normal}(0,\ \sigma_i) \\ \sigma_i \sim \text{Half-Cauchy}(0,\ 1) \]

Model v1.5.0

Straightforward implementation of the above model:

# Model v1.5.0 written in rethinking's alist() notation (shown for reference;
# the model actually fit below is the equivalent hand-written Stan program).
alist(
  # Likelihood: reads for the taxon of interest out of total specimen reads
  read_counts ~ dbinom(total_reads, prob),
  # Logit-scale linear predictor: global intercept, subject and specimen
  # intercepts, previous-day proportion, and per-subject antibiotic effect
  logit(prob) <-  a0 + a_subj[subject] + a_spec[specimen] + b_lag*lag_prop + b_abx[subject]*on_abx,
  # Global parameters
  a0 ~ dnorm(0, 1),
  b_lag ~ dnorm(0, 1),
  # Subject-level parameters (partially pooled via shared sigmas)
  a_subj[subject] ~ dnorm(0, sigma_subj),
  sigma_subj ~ dcauchy(0, 1),
  b_abx[subject] ~ dnorm(0, sigma_abx),
  sigma_abx ~ dcauchy(0, 1),
  # Specimen-level parameters (partially pooled)
  a_spec[specimen] ~ dnorm(0, sigma_spec),
  sigma_spec ~ dcauchy(0, 1)
)

To prepare the data, we restrict it to just the sputum specimens, and add variables for the previous day’s proportional abundance.

# Restrict to sputum specimens and build the lagged predictors for the model
d1.5.0.sp <- d2 %>% filter(specimen_type == "Sputum") %>%
  select(subject_id=subject_id2, specimen_id2, study_day, total_reads, read_count, abx_yn, on_abx) %>%
  # Remove empty or single-sample subjects
  # NOTE(review): the filter below is commented out, so single-sample
  # subjects are currently retained despite the comment above -- confirm
  group_by(subject_id) %>%
  # filter(n() > 1) %>%
  arrange(subject_id, study_day) %>%
  # Add lagged read and empirical proportion terms
  # NOTE(review): lag() uses the previous *row* within each subject, not the
  # previous calendar day -- across gaps in study_day the "previous day" may
  # actually be several days earlier; confirm intended
  mutate(lag_count = lag(read_count)) %>%
  mutate(lag_emp_prop = lag(read_count)/lag(total_reads)) %>%
  # Remove missing cases 
  filter(!is.na(lag_count) & !is.na(read_count)) %>%
  ungroup() %>%
  droplevels()
# compose_data() (tidybayes) converts the data frame to the list format Stan
# expects, adding index/count variables such as n and n_subject_id
d1.5.0.spl <- compose_data(d1.5.0.sp)

This plot shows the data being input into the model, as proportions. Blue indicates periods when antibiotics are considered active, as determined earlier.

And here is a slice of the data showing just MIRA_012:

# Slice of the model input for a single subject; .dm12 is reused for the
# plot below, so the name must stay stable
.dm12 <- d1.5.0.sp %>% filter(subject_id=="MIRA_012") %>% 
  select(subject_id, study_day, read_count, total_reads, on_abx, lag_emp_prop)
# kable() comes from knitr (attached during R Markdown rendering);
# kableExtra is deliberately left commented out in the setup chunk
kable(.dm12)
subject_id study_day read_count total_reads on_abx lag_emp_prop
MIRA_012 1 12322 156193 FALSE 0.0001739
MIRA_012 2 28715 101267 FALSE 0.0788896
MIRA_012 3 21966 168130 FALSE 0.2835573
MIRA_012 4 9420 118728 FALSE 0.1306489
MIRA_012 7 7241 175965 FALSE 0.0793410
MIRA_012 8 4461 125232 FALSE 0.0411502
MIRA_012 9 5374 132877 FALSE 0.0356219
MIRA_012 10 140 18171 FALSE 0.0404434
MIRA_012 11 24541 298645 FALSE 0.0077046
MIRA_012 14 19934 102019 FALSE 0.0821745
MIRA_012 17 11361 131557 FALSE 0.1953950
MIRA_012 18 101 2499 FALSE 0.0863580
MIRA_012 21 41298 279284 TRUE 0.0404162
MIRA_012 22 408 6405 TRUE 0.1478710
MIRA_012 23 1771 11692 FALSE 0.0637002
MIRA_012 24 3222 64496 FALSE 0.1514711
MIRA_012 25 673 13990 FALSE 0.0499566
MIRA_012 28 2986 33616 FALSE 0.0481058
MIRA_012 30 17691 100509 TRUE 0.0888267
MIRA_012 31 0 148 TRUE 0.1760141
MIRA_012 32 9636 63630 TRUE 0.0000000
MIRA_012 35 12769 67277 FALSE 0.1514380
MIRA_012 36 14337 130406 FALSE 0.1897974
MIRA_012 37 0 16145 FALSE 0.1099413
# Timecourse for MIRA_012: proportional abundance by study day, colored by
# whether the antibiotic effectiveness window is active (blue = active).
# The original chain included scale_alpha_manual(), but no alpha aesthetic
# is mapped anywhere in this plot, so that scale was dead code; removed.
ggplot(.dm12, aes(x=study_day, y=read_count/total_reads, color=on_abx, group=subject_id)) +
  geom_point(shape=21) +
  geom_line() +
  scale_y_continuous(labels=scales::percent, expand=c(0.2,0)) +
  scale_color_manual(values=c("TRUE"="dodgerblue", "FALSE"="grey40")) +
  mira_theme +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none") +
  labs(x="Study day", y="Proportional abundance")

The Stan code for the model:

writeLines(readLines("arb_1.5.0.stan"))
data{
  int<lower=1> n;               // number of specimens (rows)
  int<lower=1> n_subject_id;    // number of subjects
  int read_count[n];            // reads attributable to the taxon of interest
  vector[n] lag_emp_prop;  // previous day's empirical proportions 
  int total_reads[n];           // total reads per specimen
  int specimen_id2[n];          // specimen index, 1..n
  int subject_id[n];            // subject index, 1..n_subject_id
  vector[n] on_abx; // whether the subject is under the effects of abx on this timepoint
}
parameters{
  // *_std parameters are unit-normal; the actual effects are recovered by
  // multiplying with the matching sigma (non-centered parameterization,
  // which improves sampling in hierarchical models)
  vector[n] a_spec_std;            // standardized specimen intercepts
  vector[n_subject_id] a_subj_std; // standardized subject intercepts
  vector[n_subject_id] b_abx_std;  // standardized per-subject abx effects
  real mu;                         // global intercept
  real<lower=0> sigma_spec;
  real<lower=0> sigma_subj;
  real<lower=0> sigma_abx;
  real b_lag;                      // coefficient on previous day's proportion
}
model{
  vector[n] prob;                  // linear predictor on the logit scale
  // Global intercept term
  // NOTE(review): prior is centered at -1 here, but the accompanying text
  // states alpha_0 ~ Normal(0, 1) -- confirm which is intended
  mu ~ normal(-1, 1);
  // Sigma and unit normal intercepts for specimen
  sigma_spec ~ cauchy(0, 1);
  a_spec_std ~ normal(0, 1);
  // Sigma and unit normal intercepts for subject
  sigma_subj ~ cauchy(0, 1);
  a_subj_std ~ normal(0, 1);
  // Normal prior for lag coefficient
  b_lag ~ normal(0, 1);
  // Per-subject abx coefficients, partially pooled via the shared sigma_abx
  // (the original comment said "no pooling", which contradicts the model)
  sigma_abx ~ cauchy(0, 1);
  b_abx_std ~ normal(0, 1);
  // Full model: abx effect enters only when on_abx[i] is nonzero
  for (i in 1:n) {
    prob[i] = mu + sigma_spec * a_spec_std[specimen_id2[i]] + sigma_subj * a_subj_std[subject_id[i]] +
    b_lag * lag_emp_prop[i] + sigma_abx * b_abx_std[subject_id[i]] * on_abx[i];
  }
  read_count ~ binomial_logit(total_reads, prob);
}
generated quantities {
  // Recover natural-scale effects from the non-centered parameters
  vector[n] a_spec = sigma_spec * a_spec_std;
  vector[n_subject_id] a_subj = sigma_subj * a_subj_std;
  vector[n_subject_id] b_abx = sigma_abx * b_abx_std;
}

Fitting the model:

# Fit model v1.5.0 on the sputum data, or reload a previously saved fit.
# Unlike the original, a missing cache file no longer crashes readRDS();
# we fall back to refitting (and re-saving) with a warning.
.fit_fp <- "fit_arb_1.5.0_sputum.rds"
if (params$load_saved && file.exists(.fit_fp)) {
  m1.5.0.sp <- readRDS(.fit_fp)
} else {
  if (params$load_saved) {
    warning("Saved fit ", .fit_fp, " not found; refitting.", call. = FALSE)
  }
  m1.5.0.sp <- stan(
    file="arb_1.5.0.stan", data=d1.5.0.spl, iter=2000, chains=4,
    control=list(max_treedepth=15, adapt_delta=0.9))
  saveRDS(m1.5.0.sp, file=.fit_fp)
}

Let’s look at the posterior estimates for the global parameters, including the \(\sigma\) terms:

# Posterior densities of the global parameters and pooling sigmas.
# gather_samples() is the older tidybayes API (later renamed gather_draws);
# it yields long-format draws with `term` and `estimate` columns.
m1.5.0.sp %>%
  recover_types(d1.5.0.sp) %>%
  gather_samples(mu, b_lag, sigma_spec, sigma_subj, sigma_abx) %>%
  # fill encodes two-sided tail probability: 0.5 at the median, 0 in the tails
  ggplot(aes(x = estimate, y=term, fill=0.5-abs(0.5-..ecdf..))) +
  geom_vline(aes(xintercept=0), linetype=3) +
  stat_density_ridges(geom="density_ridges_gradient", calc_ecdf=TRUE, scale=1.5) +
  scale_fill_viridis_c("Probability", direction=1, alpha=0.7, option="C") +
  scale_x_continuous(expand=c(0,0)) +
  labs(x="Parameter estimate", y="Parameter", title="Global parameter estimates")

There is a huge amount of variance in the sigma_subj term, indicating a wide spread of intercepts for the various subjects, and a good indication that the subject-level intercepts were warranted. There is much less effect for the sigma_abx term, suggesting that the effects of antibiotics were similar between subjects.

Next, let’s look at the estimates for the \(\beta_{\text{abx}}\) terms, which is roughly the effect of antibiotics on the abundance of the bacteria.

# Per-subject antibiotic effect estimates (b_abx), one ridge per subject.
# spread_samples() (older tidybayes API; later spread_draws) indexes the
# draws by subject_id thanks to recover_types().
m1.5.0.sp %>%
  recover_types(d1.5.0.sp) %>%
  spread_samples(b_abx[subject_id]) %>%
  # fill encodes two-sided tail probability; x-axis clipped to [-2, 2]
  ggplot(aes(x = b_abx, y=fct_rev(subject_id), fill=0.5-abs(0.5-..ecdf..))) +
  stat_density_ridges(geom="density_ridges_gradient", calc_ecdf=TRUE, scale=1.5) +
  geom_vline(aes(xintercept=0), linetype=2, alpha=0.4) +
  scale_fill_viridis_c("Probability", direction=1, option="C") +
  scale_x_continuous(expand=c(0,0), limits=c(-2,2)) +
  scale_y_discrete(expand=c(0,1.5)) +
  labs(x="Parameter estimate", y="Subject", title="Antibiotic term estimates")

Most of these estimates are centered around zero with long tails, which suggests that there is not a strong effect of antibiotics. The one real outlier is MIRA_013.

The best way to understand these terms is to effectively simulate “new” timecourses from these estimates.

# Posterior-predictive timecourses per subject.
# with(data.frame(), ...) creates a throwaway evaluation scope so the
# temporaries (.m, .d, post, ...) do not leak into the global environment;
# invisible() suppresses the block's return value in the rendered output.
invisible(with(data.frame(), {
    # browser()
    .m <- m1.5.0.sp
    .d <- d1.5.0.sp
    # Map integer indices in the fit back to the data's factor levels
    .m <- recover_types(.m, .d)
    post <- spread_samples(.m, mu, sigma_spec, sigma_subj, sigma_abx, b_lag, 
        a_spec[specimen_id2], a_subj[subject_id], b_abx[subject_id])
    post$specimen_idx <- as.integer(post$specimen_id2)
    # Join draws to the observed data; drop specimens without observed counts
    post2 <- left_join(post, .d) %>% filter(!is.na(read_count))
    
    # With specimen-specific intercepts
    # pp_subject_1.5.0() is defined in shared_functions.R -- presumably it
    # simulates and plots one subject's timecourse; confirm its contract there
    pp_subject_spec_plots <- lapply(X = as.character(unique(post$subject_id)), 
        function(s) {
            print(s)
            pp_subject_1.5.0(post2, s, .d, TRUE)
        })
    # `+` on ggplots here is patchwork composition into one layout
    pp_subject_spec_plots <- Reduce(`+`, pp_subject_spec_plots)
    plot(pp_subject_spec_plots)
    
    # Without specimen-specific intercepts
    pp_subject_plots <- lapply(X = as.character(unique(post$subject_id)), function(s) {
        print(s)
        pp_subject_1.5.0(post2, s, .d, FALSE)
    })
    pp_subject_plots <- Reduce(`+`, pp_subject_plots)
    plot(pp_subject_plots)
    
    # Subject MIRA_012 specifically:
    mira12 <- pp_subject_1.5.0(post2, "MIRA_012", .d, FALSE)
    plot(mira12)
}))